More ggplot2 with tidyverse

Leo Lu


More ggplot2 with tidyverse

2018-03-05

呂奕 Leo Lu
台大工管
目前於金融業服務



How to use this slides

大綱

複習 ggplot2

Aesthetic Mapping

Aesthetics 基本題 1

Aesthetic Mappings

在 x-y 二維的 Scatterplot 加入第三個 aesthetic

Aesthetic Mappings

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, color = class))

Aesthetics 基本題 2

Aesthetic Mappings

ggplot(data = <DATA>) + # Data
  geom_<xxx>(
     mapping = aes(<MAPPINGS>), ##  <= Aesthetic mappings
     stat = <STAT>,
     position = <POSITION>
  ) +
  scale_<xxx>() + coord_<xxx>() + facet_<xxx>()
  theme_()

Static Aesthetic

有時候你可能只想要手動設定某個固定 aesthetic,這個例子的設定只為了美觀, 並不會帶出多餘資料訊息。

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy), color = "blue")

Visualise the model

The best stats you’ve ever seen | Hans Rosling

Take a look at our data

library(gapminder)
gapminder
#> # A tibble: 1,704 x 6
#>    country     continent  year lifeExp      pop gdpPercap
#>    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
#>  1 Afghanistan Asia       1952    28.8  8425333      779.
#>  2 Afghanistan Asia       1957    30.3  9240934      821.
#>  3 Afghanistan Asia       1962    32.0 10267083      853.
#>  4 Afghanistan Asia       1967    34.0 11537966      836.
#>  5 Afghanistan Asia       1972    36.1 13079460      740.
#>  6 Afghanistan Asia       1977    38.4 14880372      786.
#>  7 Afghanistan Asia       1982    39.9 12881816      978.
#>  8 Afghanistan Asia       1987    40.8 13867957      852.
#>  9 Afghanistan Asia       1992    41.7 16317921      649.
#> 10 Afghanistan Asia       1997    41.8 22227415      635.
#> # ... with 1,694 more rows

Our ggplot

Fit a model to each country

R packages

  1. Nested data (tidyr)
  2. Functional programming (purrr)
  3. Models → tidy data (broom)

Split our data into data.frames by group

Split our data

library(dplyr)
library(tidyr)

gapminder <- gapminder %>% mutate(year1950 = year - 1950)
by_country <- gapminder %>% 
  group_by(continent, country) %>% 
  nest

by_country
#> # A tibble: 142 x 3
#>    continent country     data             
#>    <fct>     <fct>       <list>           
#>  1 Asia      Afghanistan <tibble [12 × 5]>
#>  2 Europe    Albania     <tibble [12 × 5]>
#>  3 Africa    Algeria     <tibble [12 × 5]>
#>  4 Africa    Angola      <tibble [12 × 5]>
#>  5 Americas  Argentina   <tibble [12 × 5]>
#>  6 Oceania   Australia   <tibble [12 × 5]>
#>  7 Europe    Austria     <tibble [12 × 5]>
#>  8 Asia      Bahrain     <tibble [12 × 5]>
#>  9 Asia      Bangladesh  <tibble [12 × 5]>
#> 10 Europe    Belgium     <tibble [12 × 5]>
#> # ... with 132 more rows

Fit a model within each country

lm(lifeExp ~ year, data = Afghanistan)
lm(lifeExp ~ year, data = Afghanistan)
...

We can use purrr::map() to fit each model

library(purrr)

country_model <- function(df) {
  lm(lifeExp ~ year1950, data = df)
}

models <- by_country %>%
  mutate(mod_lm = map(data, country_model))

models
#> # A tibble: 142 x 4
#>    continent country     data              mod_lm  
#>    <fct>     <fct>       <list>            <list>  
#>  1 Asia      Afghanistan <tibble [12 × 5]> <S3: lm>
#>  2 Europe    Albania     <tibble [12 × 5]> <S3: lm>
#>  3 Africa    Algeria     <tibble [12 × 5]> <S3: lm>
#>  4 Africa    Angola      <tibble [12 × 5]> <S3: lm>
#>  5 Americas  Argentina   <tibble [12 × 5]> <S3: lm>
#>  6 Oceania   Australia   <tibble [12 × 5]> <S3: lm>
#>  7 Europe    Austria     <tibble [12 × 5]> <S3: lm>
#>  8 Asia      Bahrain     <tibble [12 × 5]> <S3: lm>
#>  9 Asia      Bangladesh  <tibble [12 × 5]> <S3: lm>
#> 10 Europe    Belgium     <tibble [12 × 5]> <S3: lm>
#> # ... with 132 more rows

What can we extract from each model?

Extract info from model

models <- models %>% 
  mutate(
    glance = mod_lm %>% map(broom::glance),
    tidy = mod_lm %>% map(broom::tidy),
    augment = mod_lm %>% map(broom::augment),
    rsq = glance %>% map_dbl("r.squared")
  )
models
#> # A tibble: 142 x 8
#>    continent country     data     mod_lm  glance   tidy    augment     rsq
#>    <fct>     <fct>       <list>   <list>  <list>   <list>  <list>    <dbl>
#>  1 Asia      Afghanistan <tibble… <S3: l… <data.f… <data.… <data.fr… 0.948
#>  2 Europe    Albania     <tibble… <S3: l… <data.f… <data.… <data.fr… 0.911
#>  3 Africa    Algeria     <tibble… <S3: l… <data.f… <data.… <data.fr… 0.985
#>  4 Africa    Angola      <tibble… <S3: l… <data.f… <data.… <data.fr… 0.888
#>  5 Americas  Argentina   <tibble… <S3: l… <data.f… <data.… <data.fr… 0.996
#>  6 Oceania   Australia   <tibble… <S3: l… <data.f… <data.… <data.fr… 0.980
#>  7 Europe    Austria     <tibble… <S3: l… <data.f… <data.… <data.fr… 0.992
#>  8 Asia      Bahrain     <tibble… <S3: l… <data.f… <data.… <data.fr… 0.967
#>  9 Asia      Bangladesh  <tibble… <S3: l… <data.f… <data.… <data.fr… 0.989
#> 10 Europe    Belgium     <tibble… <S3: l… <data.f… <data.… <data.fr… 0.995
#> # ... with 132 more rows

See the R^2

models %>% 
  ggplot(aes(rsq, reorder(country, rsq))) +  # use factor levels
  geom_point(aes(colour = continent))

Unnest to a regular data frame

models %>% unnest(data)
#> # A tibble: 1,704 x 8
#>    continent country       rsq  year lifeExp      pop gdpPercap year1950
#>    <fct>     <fct>       <dbl> <int>   <dbl>    <int>     <dbl>    <dbl>
#>  1 Asia      Afghanistan 0.948  1952    28.8  8425333      779.       2.
#>  2 Asia      Afghanistan 0.948  1957    30.3  9240934      821.       7.
#>  3 Asia      Afghanistan 0.948  1962    32.0 10267083      853.      12.
#>  4 Asia      Afghanistan 0.948  1967    34.0 11537966      836.      17.
#>  5 Asia      Afghanistan 0.948  1972    36.1 13079460      740.      22.
#>  6 Asia      Afghanistan 0.948  1977    38.4 14880372      786.      27.
#>  7 Asia      Afghanistan 0.948  1982    39.9 12881816      978.      32.
#>  8 Asia      Afghanistan 0.948  1987    40.8 13867957      852.      37.
#>  9 Asia      Afghanistan 0.948  1992    41.7 16317921      649.      42.
#> 10 Asia      Afghanistan 0.948  1997    41.8 22227415      635.      47.
#> # ... with 1,694 more rows

Unnest to a regular data frame

models %>% unnest(glance, .drop = TRUE)
#> # A tibble: 142 x 14
#>    continent country       rsq r.squared adj.r.squared sigma statistic
#>    <fct>     <fct>       <dbl>     <dbl>         <dbl> <dbl>     <dbl>
#>  1 Asia      Afghanistan 0.948     0.948         0.942 1.22      181. 
#>  2 Europe    Albania     0.911     0.911         0.902 1.98      102. 
#>  3 Africa    Algeria     0.985     0.985         0.984 1.32      662. 
#>  4 Africa    Angola      0.888     0.888         0.877 1.41       79.1
#>  5 Americas  Argentina   0.996     0.996         0.995 0.292    2246. 
#>  6 Oceania   Australia   0.980     0.980         0.978 0.621     481. 
#>  7 Europe    Austria     0.992     0.992         0.991 0.407    1261. 
#>  8 Asia      Bahrain     0.967     0.967         0.963 1.64      291. 
#>  9 Asia      Bangladesh  0.989     0.989         0.988 0.977     930. 
#> 10 Europe    Belgium     0.995     0.995         0.994 0.293    1822. 
#> # ... with 132 more rows, and 7 more variables: p.value <dbl>, df <int>,
#> #   logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>, df.residual <int>

Unnest to a regular data frame

models %>% unnest(tidy, .drop = TRUE)
#> # A tibble: 284 x 8
#>    continent country       rsq term  estimate std.error statistic  p.value
#>    <fct>     <fct>       <dbl> <chr>    <dbl>     <dbl>     <dbl>    <dbl>
#>  1 Asia      Afghanistan 0.948 (Int…   29.4     0.699       42.0  1.40e-12
#>  2 Asia      Afghanistan 0.948 year…    0.275   0.0205      13.5  9.84e- 8
#>  3 Europe    Albania     0.911 (Int…   58.6     1.13        51.7  1.79e-13
#>  4 Europe    Albania     0.911 year…    0.335   0.0332      10.1  1.46e- 6
#>  5 Africa    Algeria     0.985 (Int…   42.2     0.756       55.8  8.22e-14
#>  6 Africa    Algeria     0.985 year…    0.569   0.0221      25.7  1.81e-10
#>  7 Africa    Angola      0.888 (Int…   31.7     0.804       39.4  2.63e-12
#>  8 Africa    Angola      0.888 year…    0.209   0.0235       8.90 4.59e- 6
#>  9 Americas  Argentina   0.996 (Int…   62.2     0.167      372.   4.80e-22
#> 10 Americas  Argentina   0.996 year…    0.232   0.00489     47.4  4.22e-13
#> # ... with 274 more rows

Plot the models

models %>% 
  unnest(tidy) %>% 
  select(continent, country, term, estimate, rsq) %>% 
  spread(key = term, value = estimate) %>% 
  ggplot(aes(`(Intercept)`, year1950)) +
  geom_point(aes(colour = continent, size = rsq)) +
  geom_smooth(se = FALSE)

Plot the models: augmented

models %>% 
  unnest(augment) %>% 
  ggplot(aes(year1950, .resid)) +
  geom_line(aes(group = country), alpha = 0.3) +
  geom_smooth(se = FALSE) +
  geom_hline(yintercept = 0, colour = "red", alpha = 0.7) +
  facet_wrap(~continent)

Quick Recap

  1. tidyr: 把物件 (例如 lm) 用 list 存在 columns 裡面
  2. purrr: Functional programming
  3. broom: Models → tidy data

ggplot2 FAQ

How do I manually change the key labels in a legend in ggplot2

library(ggplot2)

## data
set.seed(2016)
grp <- gl(n=4, k=20, labels=c("group a","group b","group c", "group d"))
value <- runif(n=80, min=10, max=150)
outcome <- cut(value,2)
(data <- data.frame(grp,value,outcome))
#>        grp     value     outcome
#> 1  group a  35.22290 (10.2,76.3]
#> 2  group a  30.01212 (10.2,76.3]
#> 3  group a 127.83051  (76.3,142]
#> 4  group a  28.70045 (10.2,76.3]
#> 5  group a  76.85034  (76.3,142]
#> 6  group a  26.97618 (10.2,76.3]
#> 7  group a  96.32838  (76.3,142]
#> 8  group a 134.67666  (76.3,142]
#> 9  group a  10.36728 (10.2,76.3]
#> 10 group a  17.48423 (10.2,76.3]
#> 11 group a  64.41631 (10.2,76.3]
#> 12 group a  48.21350 (10.2,76.3]
#> 13 group a  41.16120 (10.2,76.3]
#> 14 group a 132.72041  (76.3,142]
#> 15 group a  44.53364 (10.2,76.3]
#> 16 group a  65.06026 (10.2,76.3]
#> 17 group a 100.05053  (76.3,142]
#> 18 group a  28.45157 (10.2,76.3]
#> 19 group a  90.16397  (76.3,142]
#> 20 group a  22.45060 (10.2,76.3]
#> 21 group b  56.60950 (10.2,76.3]
#> 22 group b 105.97659  (76.3,142]
#> 23 group b  64.47677 (10.2,76.3]
#> 24 group b  88.47872  (76.3,142]
#> 25 group b 137.36254  (76.3,142]
#> 26 group b  93.86846  (76.3,142]
#> 27 group b  35.56070 (10.2,76.3]
#> 28 group b  95.71090  (76.3,142]
#> 29 group b 142.24888  (76.3,142]
#> 30 group b  70.01091 (10.2,76.3]
#> 31 group b  98.30474  (76.3,142]
#> 32 group b 124.95707  (76.3,142]
#> 33 group b  36.63055 (10.2,76.3]
#> 34 group b 133.01564  (76.3,142]
#> 35 group b 129.15391  (76.3,142]
#> 36 group b  30.68462 (10.2,76.3]
#> 37 group b  43.79335 (10.2,76.3]
#> 38 group b  49.33383 (10.2,76.3]
#> 39 group b  43.93878 (10.2,76.3]
#> 40 group b 104.30316  (76.3,142]
#> 41 group c  57.25786 (10.2,76.3]
#> 42 group c  23.86566 (10.2,76.3]
#> 43 group c  45.88386 (10.2,76.3]
#> 44 group c 103.77243  (76.3,142]
#> 45 group c  62.16864 (10.2,76.3]
#> 46 group c  48.48868 (10.2,76.3]
#> 47 group c 111.38763  (76.3,142]
#> 48 group c  53.71373 (10.2,76.3]
#> 49 group c  89.98226  (76.3,142]
#> 50 group c  77.86725  (76.3,142]
#> 51 group c  32.09404 (10.2,76.3]
#> 52 group c  68.03531 (10.2,76.3]
#> 53 group c  46.23794 (10.2,76.3]
#> 54 group c  62.36134 (10.2,76.3]
#> 55 group c 137.55317  (76.3,142]
#> 56 group c  79.68930  (76.3,142]
#> 57 group c 131.94144  (76.3,142]
#> 58 group c  18.01237 (10.2,76.3]
#> 59 group c  29.73031 (10.2,76.3]
#> 60 group c  63.13070 (10.2,76.3]
#> 61 group d  32.43695 (10.2,76.3]
#> 62 group d  80.13010  (76.3,142]
#> 63 group d  75.26836 (10.2,76.3]
#> 64 group d  34.06693 (10.2,76.3]
#> 65 group d 134.69061  (76.3,142]
#> 66 group d 130.04567  (76.3,142]
#> 67 group d  19.32048 (10.2,76.3]
#> 68 group d  23.05680 (10.2,76.3]
#> 69 group d  82.97590  (76.3,142]
#> 70 group d 133.07933  (76.3,142]
#> 71 group d 137.68230  (76.3,142]
#> 72 group d 142.15229  (76.3,142]
#> 73 group d  38.12144 (10.2,76.3]
#> 74 group d 106.15707  (76.3,142]
#> 75 group d  39.43386 (10.2,76.3]
#> 76 group d  19.20592 (10.2,76.3]
#> 77 group d  53.36572 (10.2,76.3]
#> 78 group d 132.45927  (76.3,142]
#> 79 group d  19.52064 (10.2,76.3]
#> 80 group d  33.12068 (10.2,76.3]
## Origianl plot

ggplot(data, aes(grp, fill=outcome)) + 
  geom_bar() + 
  xlab("group") +
  ylab("number of subjects") + 
  labs(fill = "Serologic response")

# Change the displayed labels for groups

# The standard way is to use the scale functions to change the displayed labels for groups. You can replace your ggplot call with

ggplot(data, aes(grp, fill=outcome)) +
  geom_bar() +
  xlab("group") +
  ylab("number of subjects") +
  scale_fill_discrete(name="Serologic response",
                      breaks=levels(data$outcome),
                      labels=c("double negative", "positive for a and/or b"))

# Note that the scale's title has been incorporated into the scale_fill_discrete call. You can do this with the "axes" too, if you like

ggplot(data, aes(grp, fill=outcome)) +
  geom_bar() +
  scale_x_discrete("group") +  # x-axis name
  scale_y_continuous("number of subjects") + # y-axis name
  scale_fill_discrete(name="Serologic response",
                      breaks=levels(data$outcome),
                      labels=c("double negative", "positive for a and/or b"))

Change the order of a discrete x scale?

沒有排序的 bar chart 很難看

d <- mpg %>% 
  group_by(class) %>% 
  summarise(n = n())
d
#> # A tibble: 7 x 2
#>   class          n
#>   <chr>      <int>
#> 1 2seater        5
#> 2 compact       47
#> 3 midsize       41
#> 4 minivan       11
#> 5 pickup        33
#> 6 subcompact    35
#> 7 suv           62

ggplot(data = d) +
  geom_bar(mapping = aes(x = reorder(class, -n), y = n),
           stat = "identity")

Reference